Blaine Grayson
09-23-2025
HockeyR and packages
library(hockeyR)
library(tidyverse)
## Warning: package 'tidyverse' was built under R version 4.4.3
## Warning: package 'ggplot2' was built under R version 4.4.3
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.2 ✔ tibble 3.2.1
## ✔ lubridate 1.9.3 ✔ tidyr 1.3.1
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
Loading and combining data play-by-play-data
pbp20 = load_pbp('2020-21')
pbp21 = load_pbp('2021-22')
pbp22 = load_pbp('2022-23')
pbp23 = load_pbp('2023-24')
pbp_train <- bind_rows(pbp20, pbp21, pbp22)
pbp_test <- pbp23
pbp_test <- pbp_test %>%
dplyr::select(-secondary_type) %>%
dplyr::rename(secondary_type = shotType)
pbp_test <- pbp_test |>
dplyr::mutate(
secondary_type = dplyr::case_when(
secondary_type == "wrist" ~ "Wrist Shot",
secondary_type == "backhand" ~ "Backhand",
secondary_type == "snap" ~ "Snap Shot",
secondary_type == "slap" ~ "Slap Shot",
secondary_type == "tip-in" ~ "Tip-In",
secondary_type == "wrap-around" ~ "Wrap-around",
secondary_type == "deflected" ~ "Deflected",
secondary_type == "poke" ~ "Poke",
secondary_type == "bat" ~ "Batted",
secondary_type == "between-legs" ~ "Between Legs",
secondary_type == "cradle" ~ "Cradle",
TRUE ~ secondary_type
)
)
Extracting shot types for events in shots and goals
shots_train <- pbp_train |>
dplyr::filter(event_type %in% c("SHOT", "GOAL")) |>
dplyr::select(event_type,secondary_type,event_player_1_name, xg) |>
dplyr::mutate(Y = ifelse(event_type == "GOAL",1,0))
Getting shooting % for each shot type
shots_summary <- shots_train |>
dplyr::group_by(secondary_type) |>
dplyr::summarize(
shots = n(),
goals = sum(Y),
shooting_pct = mean(Y),
.groups = "drop"
)
Attaching predictions back to each shot in the test set
shots_test <- pbp_test |>
dplyr::filter(event_type %in% c("SHOT", "GOAL")) |>
dplyr::select(event_type, secondary_type, event_player_1_name) |>
dplyr::mutate(Y = ifelse(event_type == "GOAL", 1, 0)) |>
left_join(shots_summary, by = "secondary_type")
Computing goals over expected (GOE) by player (model 1)
goe_model1 <-
shots_test |>
dplyr::mutate(diff = Y - shooting_pct) |>
dplyr::group_by(event_player_1_name) |>
dplyr::summarise(
player = unique(event_player_1_name),
GOE = round(sum(diff),3),
shots = dplyr::n(),
goals = sum(Y)
) |>
dplyr::select(player, shots, goals, GOE) |>
dplyr::arrange(dplyr::desc(GOE))
goe_model1
Calculating Log Loss for Model 1 and NHL xG
logloss <- function(y, phat){
keep <- !is.na(y) & !is.na(phat)
y <- y[keep]
phat <- phat[keep]
if(any(phat < 1e-12)) phat[phat < 1e-12] <- 1e-12
if(any(phat > 1 - 1e-12)) phat[phat > 1 - 1e-12] <- 1 - 1e-12
return(-1 * mean(y * log(phat) + (1 - y) * log(1 - phat)))
}
cat("Shot Type Log Loss:",
round(logloss(shots_test$Y, shots_test$shooting_pct), 3), "\n")
## Shot Type Log Loss: 0.329
cat("NHL xG Log Loss:",
round(logloss(shots_train$Y, shots_train$xg), 3), "\n")
## NHL xG Log Loss: 0.269
Model 2
Preparing data for model 2
pbp_model2 = pbp_train |>
select(xg, season, description, event_type, event, secondary_type,shot_distance, shot_angle, event_player_1_name, event_player_1_id) |>
filter(event_type %in% c("SHOT", "GOAL")) |>
mutate(Y = ifelse(event_type == "GOAL", 1, 0))
pbp_model2
Fitting a linear regression model that will predict on distance, angle, and shot type
model2 = glm(formula = Y~shot_distance + shot_angle + secondary_type, data = pbp_model2, family = binomial("logit"))
model2
##
## Call: glm(formula = Y ~ shot_distance + shot_angle + secondary_type,
## family = binomial("logit"), data = pbp_model2)
##
## Coefficients:
## (Intercept) shot_distance
## -0.97348 -0.03798
## shot_angle secondary_typeBatted
## -0.01317 0.72395
## secondary_typeBetween Legs secondary_typeCradle
## 0.03674 1.24339
## secondary_typeDeflected secondary_typePenalty Shot
## 0.24263 1.12735
## secondary_typePoke secondary_typeSlap Shot
## 0.02471 0.48893
## secondary_typeSnap Shot secondary_typeTip-In
## 0.50279 0.17230
## secondary_typeWrap-around secondary_typeWrist Shot
## -0.55114 0.22909
##
## Degrees of Freedom: 234252 Total (i.e. Null); 234239 Residual
## (29 observations deleted due to missingness)
## Null Deviance: 152500
## Residual Deviance: 142800 AIC: 142800
Adding Predicted xG from model2
shot_test2 <- pbp_test |>
dplyr::filter(event_type %in% c("SHOT", "GOAL")) |>
dplyr::select(description, event_type, event, secondary_type,shot_distance, shot_angle, event_player_1_name, event_player_1_id) |>
dplyr::mutate(Y = ifelse(event_type == "GOAL", 1, 0)) |>
left_join(shots_summary, by = "secondary_type")
shot_test2 = shot_test2 |> mutate(xg2 = predict(model2, newdata = shot_test2, type="response")) |> drop_na(xg2)
Computing GOE per player based on model 2
shot_test2 = shot_test2 |> mutate(GOE_ON_PLAY = Y - xg2)
GOE_Leaders <- shot_test2 |>
group_by(event_player_1_name) |>
summarise(
GOE = sum(GOE_ON_PLAY),
shots = n(),
goals = sum(Y),
.groups = "drop"
) |>
mutate(GOE = round(GOE, 3)) |>
select(player = event_player_1_name, goals, shots, GOE) |>
arrange(desc(GOE))
GOE_Leaders
Log Loss for Model 2
cat("Model 2 Log Loss:",
round(logloss(shot_test2$Y, shot_test2$xg2), digits=3),"\n")
## Model 2 Log Loss: 0.309
cat("NHL xG Log Loss:",
round(logloss(shots_train$Y, shots_train$xg), 3), "\n")
## NHL xG Log Loss: 0.269
Model 3
Preparing data for model 3
pbp_filtered = pbp_train |>
select(xg, season, description, event_type, event, secondary_type, event_goalie_name, event_goalie_id, strength_code, strength, shot_distance, shot_angle, event_player_1_name, event_player_1_id) |>
filter(event_type %in% c("SHOT", "GOAL"))
pbp_filtered
Adding a column that tracks a goalies save percentage for that year
goalies_train = pbp_filtered |> group_by(event_goalie_name, season) |>
summarize(sv_pct = sum(event_type == "SHOT") / n())
## `summarise()` has grouped output by 'event_goalie_name'. You can override using
## the `.groups` argument.
pbp_final_train = left_join(pbp_filtered, goalies_train, by = c("event_goalie_name", "season"))
pbp_final_train = pbp_final_train |> mutate(Y = ifelse(event_type == "GOAL", 1, 0))
pbp_final_train
Fitting logistic regression model3
model3 = glm(formula = Y~shot_distance + shot_angle + secondary_type + sv_pct + strength, data = pbp_final_train, family = binomial("logit"))
model3
##
## Call: glm(formula = Y ~ shot_distance + shot_angle + secondary_type +
## sv_pct + strength, family = binomial("logit"), data = pbp_final_train)
##
## Coefficients:
## (Intercept) shot_distance
## 11.11790 -0.05388
## shot_angle secondary_typeBatted
## -0.01233 0.70745
## secondary_typeBetween Legs secondary_typeCradle
## -0.09129 1.31750
## secondary_typeDeflected secondary_typePenalty Shot
## 0.32361 1.17451
## secondary_typePoke secondary_typeSlap Shot
## -0.15978 0.94691
## secondary_typeSnap Shot secondary_typeTip-In
## 0.75162 0.21205
## secondary_typeWrap-around secondary_typeWrist Shot
## -0.59727 0.36483
## sv_pct strengthPower Play
## -13.33082 0.33313
## strengthShorthanded
## 0.19792
##
## Degrees of Freedom: 233568 Total (i.e. Null); 233552 Residual
## (713 observations deleted due to missingness)
## Null Deviance: 151000
## Residual Deviance: 130500 AIC: 130500
Preparing test data for model3
pbp_filtered_test <- pbp_test |>
dplyr::select(description, event_type, event, secondary_type,
event_goalie_name, event_goalie_id, strength_code, strength,
shot_distance, shot_angle, event_player_1_name, event_player_1_id) |>
dplyr::filter(event_type %in% c("SHOT", "GOAL")) |>
dplyr::mutate(Y = ifelse(event_type == "GOAL", 1, 0))
pbp_filtered_test
calculating goalie save percentage for test set
goalies_test = pbp_filtered_test |> group_by(event_goalie_name) |>
summarize(sv_pct = sum(event_type == "SHOT") / n())
pbp_final_test = left_join(pbp_filtered_test, goalies_test, by = c("event_goalie_name"))
pbp_final_test = pbp_final_test |> mutate(Y = ifelse(event_type == "GOAL", 1, 0))
pbp_final_test
Adding predicted xG from model 3 and Computing GOE per player
pbp_final_test = pbp_final_test |> mutate(xg3 = predict(model3, newdata = pbp_final_test, type="response")) |> drop_na(xg3)
pbp_final_test = pbp_final_test |> mutate(GOE_ON_PLAY = Y - xg3)
GOE_Leaders_final <- pbp_final_test |>
group_by(event_player_1_name) |>
summarise(
GOE = sum(GOE_ON_PLAY),
shots = n(),
goals = sum(Y),
.groups = "drop"
) |>
mutate(GOE = round(GOE, 3)) |>
select(player = event_player_1_name, goals, shots, GOE) |>
arrange(desc(GOE))
GOE_Leaders_final
Log Loss model 3
cat("Model 3 Log Loss:",
round(logloss(pbp_final_test$Y, pbp_final_test$xg3), digits=3),"\n")
## Model 3 Log Loss: 0.269
cat("NHL xG Log Loss:",
round(logloss(shots_train$Y, shots_train$xg), 3), "\n")
## NHL xG Log Loss: 0.269
Forest Data
shot_vars =
c("Y", "secondary_type", "strength", "shot_distance", "shot_angle", "sv_pct")
train_data_tree <- pbp_final_train |>
dplyr::select(all_of(shot_vars))
y_train = train_data_tree$Y
model_3_tree = ranger::ranger(formula = Y~., data = train_data_tree, probability = TRUE)
## Growing trees.. Progress: 85%. Estimated remaining time: 5 seconds.
pbp_test_tree <- pbp_final_test |>
mutate(xg_model3 = predict(object = model_3_tree, data = pbp_final_test)$predictions[,2])
goe_model3 = pbp_test_tree |>
dplyr::mutate(diff = Y - xg_model3) |>
dplyr::group_by(event_player_1_name) |>
dplyr::summarise(
GOE = round(sum(diff),3),
shots = n(),
goals = sum(Y)
) |>
dplyr::arrange(desc(GOE))
goe_model3
Player Testing
goe_model3 |> filter(event_player_1_name == "Matthew Tkachuk")
cat("Model 3 Log Loss:",
round(logloss(pbp_test_tree$Y, pbp_test_tree$xg_model3), digits=3),"\n")
## Model 3 Log Loss: 0.279